################################################################################
# Evaluation of Finetuning Hate Speech Classifier with different 
# Annotation Datasets. (We use gpt-4o-mini-2024-07-18)
################################################################################
################################################################################
# Libraries
################################################################################
library(dplyr)
library(tidyr)
library(readr)
library(pbmcapply)
library(stringr)
library(tidymodels)
library(caret)
library(lubridate)
library(ggplot2)
library(ggthemes)
library(scales)
library(irr)
library(tidycomm)
library(forestmangr)
library(ggcorrplot)
library(RColorBrewer)
library(cvms)
library(cowplot)
library(binom)
library(boot)
library(xtable)
################################################################################
# Setup
################################################################################
# Remove every object from the current environment so the script starts from a
# clean slate. NOTE(review): in an interactive session this also deletes
# whatever the user had loaded before running the script.
rm(list = ls())

# Use the directory of the currently active document as working directory so
# that the relative "../annotations/..." paths below resolve.
# NOTE(review): relies on rstudioapi and an active RStudio document; presumably
# this does not work when the script is run outside RStudio -- confirm.
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
parent_path <- getwd()
getwd()

# Fixed seed for the random steps further down (slice_sample(), boot()).
set.seed(123456789)
################################################################################
# Load Data 
################################################################################
# Predictions of the models fine-tuned on 100 training rows,
# one file per annotation source
#---------------------------------------------------------------
expert_df_100 <- read_csv(
  "../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_1_task_1/ds_1_t_1_trn_100-finetune_chetGPT_hatespeech_experts.csv"
)
ngo_df_100 <- read_csv(
  "../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_2_task_1/ds_2_t_1_trn_100-finetune_chetGPT_hatespeech_ngo.csv"
)
appen_df_100 <- read_csv(
  "../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_3_task_1/ds_3_t_1_trn_100-finetune_chetGPT_hatespeech_appen.csv"
)
citi_df_100 <- read_csv(
  "../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_4_task_1/ds_4_t_1_trn_100-finetune_chetGPT_hatespeech_citi.csv"
)
prolific_df_100 <- read_csv(
  "../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_5_task_1/ds_5_t_1_trn_100-finetune_chetGPT_hatespeech_prolific.csv"
)
ras_df_100 <- read_csv(
  "../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_6_task_1/ds_6_t_1_trn_100-finetune_chetGPT_hatespeech_research_assistants.csv"
)

# Predictions of the models fine-tuned on 250 training rows
#---------------------------------------------------------------
expert_df_250 <- read_csv(
  "../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_1_task_1/ds_1_t_1_trn_250-finetune_chetGPT_hatespeech_experts.csv"
)
ngo_df_250 <- read_csv(
  "../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_2_task_1/ds_2_t_1_trn_250-finetune_chetGPT_hatespeech_ngo.csv"
)
appen_df_250 <- read_csv(
  "../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_3_task_1/ds_3_t_1_trn_250-finetune_chetGPT_hatespeech_appen.csv"
)
citi_df_250 <- read_csv(
  "../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_4_task_1/ds_4_t_1_trn_250-finetune_chetGPT_hatespeech_citi.csv"
)
prolific_df_250 <- read_csv(
  "../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_5_task_1/ds_5_t_1_trn_250-finetune_chetGPT_hatespeech_prolific.csv"
)
ras_df_250 <- read_csv(
  "../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_6_task_1/ds_6_t_1_trn_250-finetune_chetGPT_hatespeech_research_assistants.csv"
)

# Models fine-tuned on the "gpt_zero" label files (dataset 7), 100 and 250 rows
gpt_df_100 <- read_csv(
  "../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_7_task_1/ds_7_t_1_trn_100-finetune_chetGPT_hatespeech_gpt_zero.csv"
)
gpt_df_250 <- read_csv(
  "../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_7_task_1/ds_7_t_1_trn_250-finetune_chetGPT_hatespeech_gpt_zero.csv"
)


# Baseline annotations of every annotator group
#---------------------------------------------------------------
# Experts: master file and its long-format counterpart
experts <- read_csv(
  "../annotations/experts/experts_for_annots_16062023_master.csv"
)
experts_long <- read_csv(
  "../annotations/experts/experts_for_annots_16062023_master_long.csv"
)

# Other annotator groups
appen <- read_csv(
  "../annotations/Appen/appen_set_29062022_for_analysis.csv"
)
ngo <- read_csv(
  "../annotations/NGO/ngo_set_23062022_for_analysis.csv"
)
prolific <- read_csv(
  "../annotations/Prolific/prolific_set_06042023_for_analysis.csv"
)
ras <- read_csv(
  "../annotations/RAs/ra_set_29062022_for_analysis.csv"
)
citi <- read_csv(
  "../annotations/Citizen Science/288_stop_hate_speech_task_run_csv/stop_hate_speech_task_run.csv"
)

# ChatGPT zero-shot annotations under three definitions
chatgpt_sensitivity_def_version <- read_csv(
  "../annotations/chatGPT/chatgpt_set_zeroshot_sensitivity_check_for_analysis.csv"
)
chatgpt_un_def_version <- read_csv(
  "../annotations/chatGPT/chatgpt_set_zeroshot_un_def_for_analysis.csv"
)
chatgpt_main_def_version <- read_csv(
  "../annotations/chatGPT/chatgpt_set_zeroshot_main_def_for_analysis.csv"
)

################################################################################
# ICR (inter-coder reliability) within each annotator group
################################################################################
# Add the unit identifier used by test_icr(): `newuniqueid` pasted to character.
add_unit_var <- function(annotations) {
  annotations %>% mutate(unit_var = paste0(newuniqueid))
}

# Inter-coder reliability of `ishatespeech` between the annotators of one set.
coder_icr <- function(annotations) {
  test_icr(annotations, unit_var = unit_var, coder_var = annotator, ishatespeech)
}

# Each long-format set keeps its new `unit_var` column; later sections use it.
appen <- add_unit_var(appen)
icr_appen <- coder_icr(appen)

ngo <- add_unit_var(ngo)
icr_ngo <- coder_icr(ngo)

prolific <- add_unit_var(prolific)
icr_proli <- coder_icr(prolific)

ras <- add_unit_var(ras)
icr_reass <- coder_icr(ras)

citi <- add_unit_var(citi)
icr_citi <- coder_icr(citi)

chatgpt_un_def_version <- add_unit_var(chatgpt_un_def_version)
icr_chat_n <- coder_icr(chatgpt_un_def_version)

chatgpt_sensitivity_def_version <- add_unit_var(chatgpt_sensitivity_def_version)
icr_chat_s <- coder_icr(chatgpt_sensitivity_def_version)

chatgpt_main_def_version <- add_unit_var(chatgpt_main_def_version)
icr_chat_m <- coder_icr(chatgpt_main_def_version)

experts_long <- add_unit_var(experts_long)
icr_experts <- coder_icr(experts_long)

# One row per platform; the Platform labels are assigned by position and must
# therefore follow the order of the bind_rows() call.
icr_table <- dplyr::bind_rows(
  icr_experts, icr_reass, icr_ngo, icr_proli, icr_appen,
  icr_citi, icr_chat_n, icr_chat_s, icr_chat_m
) %>%
  round_df(digits = 3)
icr_table$Platform <- c("Experts", "Research Assistants",  "NGO",  "Prolific",  "Appen", "Citizen Science", "Chat GPT (United Nations Definition)", "Chat GPT (Sensitivity Check)","Chat GPT")
################################################################################
# Merge Annotation Sets
################################################################################
# Bring every long-format annotation set into wide format (one column per
# annotator) and derive the majority labels of each group.

# Pivot one long-format annotation set to wide format and add majority labels.
#
# annotations: long-format data with the columns `newuniqueid`, `annotator`,
#   `ishatespeech` and `target_group`.
# coder_ids:   the three values of `annotator`, in the order in which they
#   become annotator 1, 2 and 3. The pivoted columns
#   ishatespeech_<id> / target_group_<id> are renamed to
#   ishatespeech_1..3 / target_group_1..3.
#
# Returns the widened data with two additional columns:
#   ishate_combined: 1 if at least two of the three annotators coded 1, else 0.
#   target_combined: the target group shared by annotators 1 and 2, else by
#     2 and 3, else by 1 and 3, else "". A comparison that involves NA yields
#     NA (ifelse() propagates it) instead of falling through to the next pair.
widen_annotations <- function(annotations, coder_ids) {
  slots <- seq_along(coder_ids)
  old_names <- c(paste0("ishatespeech_", coder_ids),
                 paste0("target_group_", coder_ids))
  new_names <- c(paste0("ishatespeech_", slots),
                 paste0("target_group_", slots))
  # Named vector new = old, as expected by rename(all_of(...)).
  lookup <- setNames(old_names, new_names)

  annotations %>%
    group_by(newuniqueid) %>%
    pivot_wider(names_from = annotator,
                values_from = c(ishatespeech, target_group)) %>%
    rename(all_of(lookup)) %>%
    mutate(ishate_combined = ifelse(ishatespeech_1 + ishatespeech_2 + ishatespeech_3 >= 2, 1, 0),
           target_combined = ifelse(target_group_1 == target_group_2, target_group_1,
                                    ifelse(target_group_2 == target_group_3, target_group_3,
                                           ifelse(target_group_1 == target_group_3, target_group_1,
                                                  ""))))
}

appen_wide    <- widen_annotations(appen, c("appen_1", "appen_2", "appen_3"))
ngo_wide      <- widen_annotations(ngo, c("af_1", "af_2", "af_3"))
prolific_wide <- widen_annotations(prolific, c("prolific1", "prolific2", "prolific3"))
ras_wide      <- widen_annotations(ras, c("ra_1", "ra_2", "ra_3"))

# These sets already use 1, 2, 3 as annotator values, so the renaming step
# leaves their column names unchanged.
citi_wide   <- widen_annotations(citi, c("1", "2", "3"))
chat_wide_n <- widen_annotations(chatgpt_un_def_version, c("1", "2", "3"))
chat_wide_s <- widen_annotations(chatgpt_sensitivity_def_version, c("1", "2", "3"))
chat_wide_m <- widen_annotations(chatgpt_main_def_version, c("1", "2", "3"))


# Wide-format expert annotations with the same column names as the other
# groups. Columns are renamed explicitly inside select(), so the mapping does
# not depend on column position. `unit_var` is ArticleID and ID pasted together.
experts_wide <- experts %>%
  mutate(unit_var = paste0(ArticleID, ID)) %>%
  select(articleid = ArticleID,
         id = ID,
         kommentar = Kommentar,
         titel = Titel,
         text = Text,
         group,
         unit_var,
         ishatespeech_1 = `Hate Speech_KD`,
         ishatespeech_2 = `Hate Speech_FG`,
         ishatespeech_3 = `Hate Speech_SK`,
         target_group_1 = `Target Group_KD`,
         target_group_2 = `Target Group_FG`,
         target_group_3 = `Target Group_SK`,
         ishate_combined = `Konsensus HS`,
         target_combined = `Konsensus Target 1`,
         difficult_case = `Difficult case?`)

# Map the German consensus target labels (including the spelling variants that
# occur in the data) onto short English codes; any other value is kept as is.
experts_wide <- experts_wide %>% mutate(target_combined = case_when(
  target_combined == "Religion" ~ "religion",
  target_combined %in% c("Nationalität/Hautfarbe/Herkunft", "Herkunft") ~ "nationality",
  target_combined %in% c("Geschlecht", "Gechlecht") ~ "sex",
  target_combined %in% c("Politische Einstellung", "Pol. Einstellung") ~ "politics",
  target_combined == "Sexualität" ~ "sexuality",
  target_combined == "Sozialer Status/Bildung/Einkommen/Berufsgruppe" ~ "social_status",
  target_combined == "Andere" ~ "other",
  target_combined == "Alter" ~ "age",
  TRUE ~ target_combined
))

# Stack the wide tables of all annotator groups and drop the grouping that the
# pivoted tables carry
df_1 <- dplyr::bind_rows(
  experts_wide, ras_wide, ngo_wide, appen_wide, prolific_wide,
  citi_wide, chat_wide_n, chat_wide_s, chat_wide_m
) %>%
  ungroup()

################################################################################
# ICR between the groups (combined labels, `group` as coder)
################################################################################
icr_combined <- df_1 %>%
  test_icr(unit_var = unit_var, coder_var = group, ishate_combined) %>%
  mutate(Platform = "Combined ICR")

# Append the between-group row to the per-platform table
icr_table <- icr_table %>%
  dplyr::bind_rows(icr_combined) %>%
  round_df(digits = 3)

# Agreement split by the experts' consensus label: the ICR statistics are
# recomputed once on the units the experts coded as hate speech and once on the
# units they coded as no hate speech, and appended to icr_table as new columns.
is_hate_filter <- experts_wide %>% filter(ishate_combined == 1)
no_hate_filter <- experts_wide %>% filter(ishate_combined == 0)

two_way <- list(is_hate_filter$unit_var, no_hate_filter$unit_var)
tow_way_name <- c("Agreement Hate Speech Only", "Agreement No Hate Speech Only")
tow_way_name_unit <- c("n_Units Hate Speech", "n_Units No Hate Speech")
tow_way_krip <- c("Krippendorffs_Alpha Hate Speech", "Krippendorffs_Alpha No Hate Speech")
tow_way_holist <- c("Holstis_CR Hate Speech", "Holstis_CR No Hate Speech")

# Long-format sets in the row order of icr_table (Experts, Research
# Assistants, NGO, Prolific, Appen, Citizen Science, the three ChatGPT
# variants). The results are attached with bind_cols(), i.e. by position, so
# this order must not change.
long_sets <- list(experts_long, ras, ngo, prolific, appen, citi,
                  chatgpt_un_def_version,
                  chatgpt_sensitivity_def_version,
                  chatgpt_main_def_version)

for (i in seq_along(two_way)) {
  keep_units <- two_way[[i]]

  # Within-group ICR on the selected units, one result per platform
  icr_by_platform <- lapply(long_sets, function(annotations) {
    annotations %>%
      filter(unit_var %in% keep_units) %>%
      test_icr(unit_var = unit_var, coder_var = annotator, ishatespeech)
  })

  # Between-group ICR on the same units (last row, matching "Combined ICR")
  icr_combined <- df_1 %>%
    filter(unit_var %in% keep_units) %>%
    test_icr(unit_var = unit_var, coder_var = group, ishate_combined)

  icr_twoway <- dplyr::bind_rows(c(icr_by_platform, list(icr_combined))) %>%
    round_df(digits = 3) %>%
    dplyr::select(c(n_Units, Agreement, Holstis_CR, Krippendorffs_Alpha))
  colnames(icr_twoway) <- c(tow_way_name_unit[i], tow_way_name[i], tow_way_holist[i], tow_way_krip[i])

  icr_table <- bind_cols(icr_table, icr_twoway)
}

icr_table$Variable <- NULL

# Export the table as HTML and CSV
stargazer::stargazer(icr_table, digits = 2, summary = FALSE, out.header = FALSE, out = "../img_gpt_4o/icr_hatespeech_table_full_2_classes.html", type = "html")
write_csv(icr_table, "../img_gpt_4o/icr_table_full_2_classes.csv")

################################################################################
# Accuracy for each group
################################################################################
# Spread the stacked table so that each annotator group gets its own set of
# label columns (suffix = value of `group`); the text columns are dropped first.
df_2 <- df_1 %>%
  select(-c(titel, text, kommentar, difficult_case)) %>%
  group_by(newuniqueid) %>%
  pivot_wider(
    names_from = group,
    values_from = c(
      ishatespeech_1, ishatespeech_2, ishatespeech_3,
      target_group_1, target_group_3, target_group_2,
      ishate_combined, target_combined
    )
  )

# Column names of the result (shown when run interactively)
names(df_2)

# Columns with the combined label of each annotator group; the experts'
# consensus label is the reference every group is compared against.
group_cols <- c("ishate_combined_ra","ishate_combined_af", "ishate_combined_citi", 
                "ishate_combined_Prolific", "ishate_combined_appen", "ishate_combined_chatGPT (Sensitivity Definition)", "ishate_combined_chatGPT (United Nations Definition)", "ishate_combined_chatGPT")

# Same groups without the two alternative ChatGPT definitions
group_cols_gpt_reduce <- c("ishate_combined_ra","ishate_combined_af", "ishate_combined_citi", 
                           "ishate_combined_Prolific", "ishate_combined_appen", "ishate_combined_chatGPT")

base_group <- "ishate_combined_experts"

# Rows where every group in group_cols has the same label as the experts
df_2$all_agree <- rowSums(df_2[group_cols] == df_2[[base_group]]) == length(group_cols)
# Rows where none of the groups in group_cols_gpt_reduce has the experts' label.
# Count and threshold now use the same column set; before, disagreements were
# counted over all 8 columns of group_cols but compared with 6, so rows were
# flagged when exactly 6 of 8 groups disagreed.
# NOTE(review): switch both to group_cols if the alternative ChatGPT
# definitions are meant to be included here.
df_2$none_agree <- rowSums(df_2[group_cols_gpt_reduce] != df_2[[base_group]]) == length(group_cols_gpt_reduce)

# Example comments for both cases: up to 5 per expert label, taken from the
# experts' table so that `ishate_experts` really is the expert label. Units are
# matched on `unit_var` (one value per annotated comment) rather than on
# `articleid`, which would also pull in other comments of the same article.
# experts_wide has no `newuniqueid` column, so `unit_var` is exported under
# that name. NOTE(review): assumes unit_var and newuniqueid identify the same
# units, as the filtering on unit_var elsewhere in this script already does.
df_samples_all_agree <- df_2 %>% filter(all_agree)
df_samples_all_agree <- experts_wide %>%
  filter(unit_var %in% df_samples_all_agree$unit_var) %>%
  select(c(newuniqueid = unit_var, kommentar, titel, text, ishate_combined)) %>%
  group_by(ishate_combined) %>%
  dplyr::slice_sample(n = 5) %>%
  ungroup() %>%
  rename(ishate_experts = ishate_combined)
write_csv(df_samples_all_agree, "../img_gpt_4o/examples_where_all_agree_2_classes.csv")

df_samples_none_agree <- df_2 %>% filter(none_agree)
df_samples_none_agree <- experts_wide %>%
  filter(unit_var %in% df_samples_none_agree$unit_var) %>%
  select(c(newuniqueid = unit_var, kommentar, titel, text, ishate_combined)) %>%
  group_by(ishate_combined) %>%
  dplyr::slice_sample(n = 5) %>%
  ungroup() %>%
  rename(ishate_experts = ishate_combined)
write_csv(df_samples_none_agree, "../img_gpt_4o/examples_where_none_agree_2_classes.csv")



# Display names of the groups, in the order of group_cols
friendly_names <- c(
  "Research Assistants", "NGO", "Citizen Science", "Prolific", "Appen",
  "chatGPT (Sensitivity Definition)", "chatGPT (United Nations Definition)",
  "chatGPT"
)

# Turn the 1/0 labels of all groups and of the experts into "Yes"/"No"
to_yes_no <- function(label) {
  ifelse(label == 1, "Yes", "No")
}
conf_df_a <- df_2 %>%
  mutate(across(all_of(c(group_cols, base_group)), to_yes_no))

# Percentile bootstrap confidence intervals for the classification metrics.
#
# data:       data frame holding the target and the prediction column.
# base_group: name of the target column (string).
# pred_col:   name of the prediction column (string).
# R:          number of bootstrap replicates (default 1000, as before).
# ncpus:      number of processes passed to boot() (default 8, as before).
#
# Returns a list with the elements Accuracy, F1, Sensitivity and PosPredValue,
# each a vector c(lower, upper). A metric for which boot.ci() cannot compute an
# interval (it returns NULL when all replicates are equal) yields c(NA, NA), so
# every element always has length two.
calculate_boot_ci <- function(data, base_group, pred_col, R = 1000, ncpus = 8) {
  # Statistic for boot(): metrics of one resampled data set
  boot_stat_func <- function(data, indices) {
    resampled <- data[indices, ]
    eval_res <- cvms::evaluate(resampled,
                               prediction_col = pred_col,
                               target_col = base_group,
                               type = "binomial")
    c(Accuracy = eval_res$Accuracy,
      F1 = eval_res$F1,
      Recall = eval_res$Sensitivity,
      Precision = eval_res$`Pos Pred Value`)
  }

  boot_res <- boot(data, boot_stat_func, R = R, parallel = "multicore", ncpus = ncpus)

  # One percentile interval per metric; positions 4:5 of `$percent` are the
  # lower and upper end points.
  ci_list <- lapply(seq_len(ncol(boot_res$t)), function(i) {
    ci <- boot.ci(boot_res, type = "perc", index = i)
    if (is.null(ci)) {
      c(NA_real_, NA_real_)
    } else {
      ci$percent[4:5]
    }
  })
  names(ci_list) <- c("Accuracy", "F1", "Sensitivity", "PosPredValue")

  ci_list
}

# Compare the combined label of every group with the experts' label and collect
# the metrics together with their bootstrap confidence intervals.
conf_mat_no_fine_tuning <- list()
result_df <- NULL

# Names of the values stored per group, in the order they are put into the row
metric_names <- c("Group","Accuracy", "F1", "Sensitivity", "Precision",
                  "AccuracyLowerCI", "AccuracyUpperCI", 
                  "F1LowerCI", "F1UpperCI",
                  "SensitivityLowerCI", "SensitivityUpperCI", 
                  "PrecisionLowerCI", "PrecisionUpperCI")

for (group_col in group_cols) {
  # Keep only the reference column and the column of the current group.
  # all_of() makes explicit that these are column names held in variables.
  tmp <- conf_df_a %>%
    ungroup() %>%
    select(all_of(c(base_group, group_col)))

  eval_results <- cvms::evaluate(tmp,
                                 prediction_col = group_col,
                                 target_col = base_group,
                                 type = "binomial")

  # Full evaluation output, kept per group
  conf_mat_no_fine_tuning[[group_col]] <- eval_results

  # Bootstrap confidence intervals of the same metrics
  ci_results <- calculate_boot_ci(tmp, base_group, group_col)

  # c() coerces every value to character because the first element is the
  # group name; the metric columns are converted back to numeric once the
  # results of all models have been combined.
  row_tmp <- c(group_col,eval_results$Accuracy, eval_results$F1, eval_results$Sensitivity, eval_results$`Pos Pred Value`,
               ci_results$Accuracy[1], ci_results$Accuracy[2], ci_results$F1[1], ci_results$F1[2],
               ci_results$Sensitivity[1], ci_results$Sensitivity[2], ci_results$PosPredValue[1], ci_results$PosPredValue[2])
  names(row_tmp) <- metric_names

  result_df <- dplyr::bind_rows(result_df, row_tmp)
}

# Replace the column names by display names (same order as group_cols)
names(conf_mat_no_fine_tuning) <- friendly_names
result_df$Group <- friendly_names
# Mark these rows as accuracy of the annotations themselves
result_df$Type <- "Accuracy of Human\nAnnotations"
################################################################################
# Evaluate the finetuned models!
################################################################################
# Hate speech and toxic speech are collapsed into one binary distinction.
# Names of the prediction data sets that get recoded:
datasets <- c(
  # fine-tuned on 100 rows
  "expert_df_100", "ngo_df_100", "appen_df_100", "citi_df_100",
  "prolific_df_100", "ras_df_100", "gpt_df_100",
  # fine-tuned on 250 rows
  "expert_df_250", "ngo_df_250", "appen_df_250", "citi_df_250",
  "prolific_df_250", "ras_df_250", "gpt_df_250"
)

# Collapse predictions and labels of one data set into two classes.
#
# df: data frame with the columns `prediction` and `label_column`.
#
# Returns df with
#   prediction:   "HATE SPEECH" where it was "HATE SPEECH" or "TOXIC SPEECH",
#                 "KEINE HATE SPEECH" for every other value (including NA);
#   label_column: 1 where it was 1 or 2, 0 for every other value.
recode_prediction <- function(df) {
  predicted_hate <- df$prediction %in% c("HATE SPEECH", "TOXIC SPEECH")
  labelled_hate <- df$label_column %in% c(1, 2)

  df$prediction <- ifelse(predicted_hate, "HATE SPEECH", "KEINE HATE SPEECH")
  df$label_column <- ifelse(labelled_hate, 1, 0)
  return(df)
}

# Recode every prediction data set in place (looked up and stored by name)
for (dataset in datasets) {
  recoded <- recode_prediction(get(dataset))
  assign(dataset, recoded)
}

# Recoded data sets: the 100-row models first, then the 250-row models
df_list <- list(
  expert_df_100, ngo_df_100, appen_df_100, citi_df_100,
  prolific_df_100, ras_df_100, gpt_df_100,
  expert_df_250, ngo_df_250, appen_df_250, citi_df_250,
  prolific_df_250, ras_df_250, gpt_df_250
)

# Display names, in the order of df_list
friendly_names <- c(
  "Experts 100", "NGO 100", "Appen 100", "Citizen Science 100",
  "Prolific 100", "Research Assistants 100", "chatGPT 100",
  "Experts 250", "NGO 250", "Appen 250", "Citizen Science 250",
  "Prolific 250", "Research Assistants 250", "chatGPT 250"
)

# Percentile bootstrap confidence intervals for the classification metrics.
#
# This definition replaces the earlier calculate_boot_ci(). The column names
# are parameters with defaults, so both call forms used in this script work:
# calculate_boot_ci(data) and calculate_boot_ci(data, base_group, pred_col).
#
# data:       data frame holding the target and the prediction column.
# base_group: name of the target column (default "label_column").
# pred_col:   name of the prediction column (default "prediction").
# R:          number of bootstrap replicates (default 1000, as before).
# ncpus:      number of processes passed to boot() (default 8, as before).
#
# Returns a list with the elements Accuracy, F1, Sensitivity and PosPredValue,
# each a vector c(lower, upper). A metric for which boot.ci() cannot compute an
# interval (it returns NULL when all replicates are equal) yields c(NA, NA), so
# every element always has length two.
calculate_boot_ci <- function(data, base_group = "label_column",
                              pred_col = "prediction", R = 1000, ncpus = 8) {
  # Statistic for boot(): metrics of one resampled data set
  boot_stat_func <- function(data, indices) {
    resampled <- data[indices, ]
    eval_res <- cvms::evaluate(resampled,
                               prediction_col = pred_col,
                               target_col = base_group,
                               type = "binomial")
    c(Accuracy = eval_res$Accuracy,
      F1 = eval_res$F1,
      Recall = eval_res$Sensitivity,
      Precision = eval_res$`Pos Pred Value`)
  }

  boot_res <- boot(data, boot_stat_func, R = R, parallel = "multicore", ncpus = ncpus)

  # One percentile interval per metric; positions 4:5 of `$percent` are the
  # lower and upper end points.
  ci_list <- lapply(seq_len(ncol(boot_res$t)), function(i) {
    ci <- boot.ci(boot_res, type = "perc", index = i)
    if (is.null(ci)) {
      c(NA_real_, NA_real_)
    } else {
      ci$percent[4:5]
    }
  })
  names(ci_list) <- c("Accuracy", "F1", "Sensitivity", "PosPredValue")

  ci_list
}

# Evaluate the predictions of every fine-tuned model against `label_column`
# and collect the metrics together with their bootstrap confidence intervals.
conf_mat_with_fine_tuning <- list()
result_df_fine <- NULL

# Names of the values stored per model, in the order they are put into the row
metric_names <- c("Group","Accuracy", "F1", "Sensitivity", "Precision",
                  "AccuracyLowerCI", "AccuracyUpperCI", 
                  "F1LowerCI", "F1UpperCI",
                  "SensitivityLowerCI", "SensitivityUpperCI", 
                  "PrecisionLowerCI", "PrecisionUpperCI")

for (i in seq_along(df_list)) {
  # Predictions as 1/0, matching the coding of label_column
  tmp <- df_list[[i]] %>% 
    mutate(prediction = ifelse(prediction == "HATE SPEECH", 1, 0))

  eval_results <- cvms::evaluate(tmp,
                                 prediction_col = "prediction",
                                 target_col = "label_column",
                                 type = "binomial")

  # Full evaluation output, kept per model
  conf_mat_with_fine_tuning[[friendly_names[i]]] <- eval_results

  # Bootstrap confidence intervals of the same metrics
  ci_results_fine <- calculate_boot_ci(tmp)

  # c() coerces every value to character because the first element is the
  # model name; the metric columns are converted back to numeric after the
  # results have been combined.
  row_tmp <- c(friendly_names[i],eval_results$Accuracy, eval_results$F1, eval_results$Sensitivity, eval_results$`Pos Pred Value`,
               ci_results_fine$Accuracy[1], ci_results_fine$Accuracy[2], ci_results_fine$F1[1], ci_results_fine$F1[2],
               ci_results_fine$Sensitivity[1], ci_results_fine$Sensitivity[2], ci_results_fine$PosPredValue[1], ci_results_fine$PosPredValue[2])
  names(row_tmp) <- metric_names

  result_df_fine <- dplyr::bind_rows(result_df_fine, row_tmp)
}

# The first 7 entries of df_list are the 100-label models, the last 7 the
# 250-label models.
Type <- rep(c("GPT-4o-mini Accuracy after\nfine-tuning on 100 labels", "GPT-4o-mini Accuracy after\nfine-tuning on 250 labels"), each = 7)
result_df_fine$Type <- Type


# Setting friendly names for the models in the results
names(conf_mat_with_fine_tuning) <- friendly_names


# Stack the metrics of the annotations and of the fine-tuned models, strip the
# " 100" / " 250" suffix from the group names (the size is kept in `Type`) and
# convert the metric columns (positions 2 to 13) from character to numeric.
result_df <- result_df %>%
  dplyr::bind_rows(result_df_fine) %>%
  mutate(Group = gsub("\\s100|\\s250", "", Group)) %>%
  mutate(across(.cols = 2:13, .fns = as.numeric))

result_df

write_csv(result_df, "../img_gpt_4o/classification_metrics_all_2_classes.csv")
################################################################################
# Figure accuracy for each group
################################################################################
# Accuracy of the zero-shot "chatGPT" annotations and its confidence interval;
# drawn as the reference band in the figure.
chatgpt_baseline <- result_df %>%
  dplyr::filter(Group %in% c("chatGPT"), Type == "Accuracy of Human\nAnnotations")
chatGPT_lower <- as.numeric(chatgpt_baseline$AccuracyLowerCI)
chatGPT_upper <- as.numeric(chatgpt_baseline$AccuracyUpperCI)
chatGPT_acc <- as.numeric(chatgpt_baseline$Accuracy)


# The bars only show the remaining groups: all chatGPT rows and the models
# fine-tuned on expert labels are removed.
filtered_results_df <- subset(
  result_df,
  Group != "chatGPT" &
    Group != "chatGPT (Sensitivity Definition)" &
    Group != "chatGPT (United Nations Definition)" &
    Group != "Experts"
)


# Ordering options for the figure. Only the last block is switched on; the
# others are alternatives kept for reference.

# Option (off): fixed order of the `Type` levels
if (FALSE) {
  type_levels <- c("GPT-4o-mini Accuracy after\nfine-tuning on 250 labels",
                   "GPT-4o-mini Accuracy after\nfine-tuning on 100 labels",
                   "Accuracy of Human\nAnnotations")
  filtered_results_df$Type <- factor(filtered_results_df$Type, levels = type_levels)
}

# Option (off): order the groups by their mean accuracy
if (FALSE) {
  by_group <- group_by(filtered_results_df, Group)
  mean_by_group <- summarise(by_group, mean_acc = mean(Accuracy))
  group_order <- pull(arrange(mean_by_group, -mean_acc), Group)

  filtered_results_df$Group <- factor(filtered_results_df$Group, levels = group_order)
}


# Option (off): order the groups by the accuracy of their annotations
if (FALSE) {
  human_annotations <- arrange(
    filter(filtered_results_df, Type == "Accuracy of Human\nAnnotations"),
    desc(Accuracy)
  )

  filtered_results_df$Group <- factor(filtered_results_df$Group, levels = human_annotations$Group)
}

# Active: fixed, hand-written order of the groups. `human_annotations` is
# computed as in the option above but is not used for the levels.
if (TRUE) {
  human_annotations <- arrange(
    filter(filtered_results_df, Type == "Accuracy of Human\nAnnotations"),
    desc(Accuracy)
  )

  group_levels <- c("Research Assistants", "Prolific", "Citizen Science", "Appen", "NGO")
  filtered_results_df$Group <- factor(filtered_results_df$Group, levels = group_levels)
}


# Accuracy plot: one horizontal, dodged bar per (Group, Type) with
# confidence-interval whiskers. The reference value chatGPT_acc is overlaid as
# a solid line, its interval (chatGPT_lower / chatGPT_upper) as dotted lines
# and a light-blue band.
# NOTE(review): chatGPT_acc / chatGPT_lower / chatGPT_upper are defined earlier
# in the script; presumably they hold the zero-shot baseline -- confirm.
pl2 <- filtered_results_df %>% filter(Group != "Experts") %>%
  ggplot(aes(x = Group, y = Accuracy, fill = Type)) +
  # Band covering the interval of the reference value.
  geom_rect(aes(xmin = -Inf, xmax = 5.5, ymin = chatGPT_lower, ymax = chatGPT_upper),
            fill = "lightblue", alpha = 0.2) +
  # geom_col() is the idiomatic spelling of geom_bar(stat = "identity").
  geom_col(position = position_dodge2(width = 1, padding = 0.1)) +
  geom_errorbar(aes(ymin = AccuracyLowerCI, ymax = AccuracyUpperCI, group = interaction(Group, Type)),
                stat = "identity",
                width = .5, alpha = 0.9,
                position = position_dodge(width = 0.9)) +
  # Reference value (solid) and the bounds of its interval (dotted).
  geom_segment(aes(x = -Inf, xend = 5.5, y = chatGPT_acc, yend = chatGPT_acc),
               color = "black",
               linewidth = 0.66,
               alpha = 0.75,
               lineend = "round") +
  geom_segment(aes(x = -Inf, xend = 5.5, y = chatGPT_lower, yend = chatGPT_lower),
               color = "black",
               linewidth = 0.33,
               linetype = "dotted",
               alpha = 0.75,
               lineend = "round") +
  geom_segment(aes(x = -Inf, xend = 5.5, y = chatGPT_upper, yend = chatGPT_upper),
               color = "black",
               linewidth = 0.33,
               linetype = "dotted",
               alpha = 0.75,
               lineend = "round") +
  labs(y = "Accuracy [%]", x = "", fill = "") +
  # Text label for the reference line, placed beyond the last group.
  annotate("text", x = 5.65, y = chatGPT_acc, label = paste("GPT-4o: ", scales::percent(chatGPT_acc)), hjust = .5, color = "black", fontface = "bold", size = 6) +
  scale_y_continuous(labels = scales::percent_format(), expand = c(0,0), limits = c(0,1.02), breaks = seq(0,1, by = 0.1)) +
  # expansion() replaces the deprecated expand_scale(); the larger upper
  # multiplier leaves room for the text label at x = 5.65.
  scale_x_discrete(expand = expansion(mult = c(0.1, 0.2))) +
  scale_fill_brewer(palette = "Dark2") +
  coord_flip() +
  theme_classic() +
  theme(legend.position = "bottom",
        axis.text.y = element_text(size = 18, face = "bold", color = "black"),
        axis.text.x = element_text(size = 14, color = "black"),
        axis.title.x = element_text(size = 18, color = "black", face = "bold"),
        legend.text = element_text(size = 14),
        plot.margin = unit(c(2, .5, .5, .5), "lines"))

# Show plot
pl2

# Save plot (vector PDF via cairo, plus a PNG with a white background)
ggsave(plot = pl2, filename = "../img_gpt_4o/accuracy_groups_vs_experts_with_types_refined_and_baseline_2_classes.pdf", width = 12, height = 9, dpi = 300, device = cairo_pdf)
ggsave(plot = pl2, filename = "../img_gpt_4o/accuracy_groups_vs_experts_with_types_refined_and_baseline_2_classes.png", width = 12, height = 9, dpi = 300, bg = "White")


################################################################################
# Precision, Recall and F1-Score per group (confusion-matrix metrics)
################################################################################
# Shorten the Type labels so that every metric plot can prepend its own metric
# name. NOTE: this rewrites result_df in place and is not idempotent -- run a
# second time, every label falls through to the "250 labels" branch.
result_df <- result_df %>% mutate(Type = ifelse(Type == "Accuracy of Human\nAnnotations", "Human\nAnnotations",
                                                ifelse(Type == "GPT-4o-mini Accuracy after\nfine-tuning on 100 labels", "after\nfine-tuning on 100 labels", "after\nfine-tuning on 250 labels")))

# Reference row: the "chatGPT" group among the "Human\nAnnotations" rows.
chatGPT_baseline <- result_df %>% dplyr::filter(Group == "chatGPT" & Type == "Human\nAnnotations")

# Precision reference value and interval
chatGPT_pre <- as.numeric(chatGPT_baseline$Precision)
chatGPT_pre_lower <- as.numeric(chatGPT_baseline$PrecisionLowerCI)
chatGPT_pre_upper <- as.numeric(chatGPT_baseline$PrecisionUpperCI)

# Recall (Sensitivity) reference value and interval; the text label is drawn
# 0.1 below the line position.
chatGPT_rec <- as.numeric(chatGPT_baseline$Sensitivity)
chatGPT_rec_lower <- as.numeric(chatGPT_baseline$SensitivityLowerCI)
chatGPT_rec_upper <- as.numeric(chatGPT_baseline$SensitivityUpperCI)
chatGPT_rec_pos <- chatGPT_rec - 0.1

# F1 reference value and interval
chatGPT_f1 <- as.numeric(chatGPT_baseline$F1)
chatGPT_f1_lower <- as.numeric(chatGPT_baseline$F1LowerCI)
chatGPT_f1_upper <- as.numeric(chatGPT_baseline$F1UpperCI)

# Rows shown as bars: everything except the chatGPT variants and the experts.
filtered_results_df <- subset(
  result_df,
  Group != "chatGPT" &
    Group != "chatGPT (Sensitivity Definition)" &
    Group != "chatGPT (United Nations Definition)" &
    Group != "Experts"
)

# Human-annotation rows ranked by F1. They are not used for the fixed ordering
# below; pass `levels = human_annotations$Group` instead for a data-driven
# ordering.
human_annotations <- filtered_results_df %>%
  filter(Type == "Human\nAnnotations") %>%
  arrange(desc(F1))

# Fixed, hand-picked group order shared by all three metric plots.
filtered_results_df$Group <- factor(filtered_results_df$Group, levels = c("Research Assistants", "Prolific", "Citizen Science", "Appen", "NGO"))

# Build one horizontal bar chart of a metric per (Group, Type) with
# confidence-interval whiskers and an overlaid reference value.
#
# plot_df        data frame with the columns Group, Type, `metric`,
#                `<metric>LowerCI` and `<metric>UpperCI`
# metric         name of the metric column ("Precision", "Sensitivity", "F1")
# metric_label   display name used in the legend entries and the axis title
# baseline       reference value drawn as a solid line
# baseline_lower, baseline_upper
#                bounds drawn as dotted lines and as a light-blue band
# baseline_xend  x position at which the solid reference line ends
# label_y        position of the text label along the metric axis
#
# Returns the ggplot object.
plot_metric_vs_baseline <- function(plot_df, metric, metric_label,
                                    baseline, baseline_lower, baseline_upper,
                                    baseline_xend = 5.5, label_y = baseline) {
  lower_col <- paste0(metric, "LowerCI")
  upper_col <- paste0(metric, "UpperCI")

  # Legend entries, in display order.
  human_label <- paste0(metric_label, " of Human\nAnnotations")
  type_levels <- c(human_label,
                   paste0("GPT-4o ", metric_label, " after\nfine-tuning on 100 labels"),
                   paste0("GPT-4o ", metric_label, " after\nfine-tuning on 250 labels"))

  plot_df %>%
    dplyr::mutate(Type = ifelse(Type == "Human\nAnnotations",
                                human_label,
                                paste0("GPT-4o ", metric_label, " ", Type))) %>%
    dplyr::mutate(Type = factor(Type, levels = type_levels)) %>%
    dplyr::filter(Group != "chatGPT") %>%
    ggplot(aes(x = Group, y = .data[[metric]], fill = Type)) +
    # Band covering the interval of the reference value.
    geom_rect(aes(xmin = -Inf, xmax = 5.5, ymin = baseline_lower, ymax = baseline_upper),
              fill = "lightblue", alpha = 0.2) +
    # geom_col() is the idiomatic spelling of geom_bar(stat = "identity").
    geom_col(position = position_dodge2(width = 0.9, padding = 0.1)) +
    geom_errorbar(aes(ymin = .data[[lower_col]], ymax = .data[[upper_col]],
                      group = interaction(Group, Type)),
                  stat = "identity",
                  width = .5, alpha = 0.9,
                  position = position_dodge(width = 0.9)) +
    # Reference value (solid) and the bounds of its interval (dotted).
    geom_segment(aes(x = -Inf, xend = baseline_xend, y = baseline, yend = baseline),
                 color = "black",
                 linewidth = 0.66,
                 alpha = 0.75,
                 lineend = "round") +
    geom_segment(aes(x = -Inf, xend = 5.5, y = baseline_lower, yend = baseline_lower),
                 color = "black",
                 linewidth = 0.33,
                 linetype = "dotted",
                 alpha = 0.75,
                 lineend = "round") +
    geom_segment(aes(x = -Inf, xend = 5.5, y = baseline_upper, yend = baseline_upper),
                 color = "black",
                 linewidth = 0.33,
                 linetype = "dotted",
                 alpha = 0.75,
                 lineend = "round") +
    labs(y = metric_label, x = "", fill = "") +
    annotate("text", x = 5.60, y = label_y, label = paste("GPT-4o: ", round(baseline, digits = 2)), hjust = .5, color = "black", fontface = "bold", size = 6) +
    scale_y_continuous(expand = c(0,0), limits = c(0,1.02), breaks = seq(0,1, by = 0.1)) +
    # expansion() replaces the deprecated expand_scale().
    scale_x_discrete(expand = expansion(mult = c(0.1, 0.2))) +
    scale_fill_brewer(palette = "Dark2") +
    coord_flip() +
    theme_classic() +
    theme(legend.position = "bottom",
          axis.text.y = element_text(size = 16, face = "bold", color = "black"),
          axis.text.x = element_text(size = 12, color = "black"),
          axis.title.x = element_text(size = 16, color = "black", face = "bold"),
          legend.text = element_text(size = 12),
          plot.margin = unit(c(2, .5, .5, .5), "lines"))
}

# Precision plot
pl3 <- plot_metric_vs_baseline(filtered_results_df, "Precision", "Precision",
                               chatGPT_pre, chatGPT_pre_lower, chatGPT_pre_upper)

# Show plot
pl3

# Recall plot: the solid line stops at 5.4 and the label sits 0.1 below it.
pl4 <- plot_metric_vs_baseline(filtered_results_df, "Sensitivity", "Recall",
                               chatGPT_rec, chatGPT_rec_lower, chatGPT_rec_upper,
                               baseline_xend = 5.4, label_y = chatGPT_rec_pos)

# Show plot
pl4

# F1 plot: the solid line stops at 5.4.
pl5 <- plot_metric_vs_baseline(filtered_results_df, "F1", "F1-Score",
                               chatGPT_f1, chatGPT_f1_lower, chatGPT_f1_upper,
                               baseline_xend = 5.4)

# Show plot
pl5

# Stack the three metric plots vertically: F1, Precision, Recall.
combined_plot <- cowplot::plot_grid(
  pl5, pl3, pl4, ncol = 1
)

# Show the combined plot
print(combined_plot)

# Save plot
ggsave(plot = combined_plot, filename = "../img_gpt_4o/accuracy_groups_vs_experts_with_types_refined_and_baseline_facets_2_classes.pdf", width = 12, height = 16, dpi = 300, device = cairo_pdf)
ggsave(plot = combined_plot, filename = "../img_gpt_4o/accuracy_groups_vs_experts_with_types_refined_and_baseline_facets_2_classes.png", width = 12, height = 16, dpi = 300, bg = "White")
################################################################################
# Confusion Table all Finetuning Models and ZeroShot GPT
################################################################################
# Sum the TP / TN / FP / FN cell counts of every model in a list of
# confusion-matrix results.
#
# conf_mat_list      list whose elements each hold a "Confusion Matrix" entry;
#                    the first item of that entry is a table with a count
#                    column `N` and an outcome-label column
# is_no_fine_tuning  TRUE reads the outcome labels from `Pos_Yes`, FALSE (the
#                    default) reads them from `Pos_1`
#
# Returns a matrix with one row per list element (row names are the list
# names) and the columns TP, TN, FP, FN.
extract_and_aggregate_metrics <- function(conf_mat_list, is_no_fine_tuning = FALSE) {
  # The two result sets differ only in the name of the outcome-label column.
  outcome_col <- if (is_no_fine_tuning) "Pos_Yes" else "Pos_1"

  count_cells <- function(model) {
    cm <- model[["Confusion Matrix"]][[1]]
    outcome <- cm[[outcome_col]]
    total_for <- function(label) sum(cm$N[outcome == label])

    c(TP = total_for("TP"),
      TN = total_for("TN"),
      FP = total_for("FP"),
      FN = total_for("FN"))
  }

  do.call(rbind, lapply(conf_mat_list, count_cells))
}

# Cell counts for the fine-tuned models (labels in `Pos_1`) and for the
# results without fine-tuning (labels in `Pos_Yes`).
metrics_with_fine_tuning <- extract_and_aggregate_metrics(conf_mat_with_fine_tuning)
metrics_no_fine_tuning <- extract_and_aggregate_metrics(conf_mat_no_fine_tuning, is_no_fine_tuning = TRUE)

# One table for both sets; the matrix row names become the `Name` column.
all_metrics <- as.data.frame(rbind(metrics_with_fine_tuning, metrics_no_fine_tuning))
all_metrics$Name <- rownames(all_metrics)

# Drop the rows that are not reported, make sure the four count columns are
# numeric, and sort by name.
all_metrics <- all_metrics %>%
  dplyr::filter(!Name %in% c("Experts 100", "Experts 250", "Research Assistants", "NGO", "Citizen Science", "Prolific", "Appen", "chatGPT 100", "chatGPT 250")) %>%
  mutate(across(TP:FN, as.numeric)) %>%
  arrange(Name)


# Wrap the largest value of each selected column in LaTeX bold markup.
#
# df    data frame holding the metric columns
# cols  names of the columns to process; defaults to the columns in positions
#       2 to 5, i.e. the layout "Name, TP, TN, FP, FN"
#
# Every processed column is returned as character: the maximum (all ties
# included) reads "\textbf{<value>}", the remaining entries are the plain
# values. Names in `cols` that do not exist in `df` and columns that contain
# only NA are left untouched.
highlight_max <- function(df, cols = names(df)[2:5]) {
  # Drops NA names (data frame with fewer than five columns) and unknown names.
  cols <- intersect(cols, names(df))

  for (col in cols) {
    values <- df[[col]]
    if (all(is.na(values))) {
      next # no maximum to highlight; avoids max() returning -Inf with a warning
    }
    max_value <- max(values, na.rm = TRUE)
    df[[col]] <- ifelse(values == max_value, paste0("\\textbf{", values, "}"), values)
  }
  df
}

# Put the ChatGPT row first and keep the remaining rows in their current
# order. The name is matched case-insensitively: the rest of this script
# spells the group "chatGPT", so an exact match on "ChatGPT" would never hit
# and the reordering would silently do nothing.
name_order <- unique(all_metrics$Name)
is_baseline <- tolower(name_order) == "chatgpt"
name_order <- c(name_order[is_baseline], name_order[!is_baseline])

all_metrics <- all_metrics %>% select(Name, everything()) %>%
  arrange(match(Name, name_order))

# Bold the per-column maxima (applied after sorting; `Name` is column 1 and
# the four count columns follow).
all_metrics_highlighted <- highlight_max(all_metrics)

# Plain data frame with `Name` as the first column.
all_metrics_df <- as.data.frame(all_metrics_highlighted)
all_metrics_df <- all_metrics_df[, c("Name", setdiff(names(all_metrics_df), "Name"))]

# Generating LaTeX table. The identity sanitize function keeps the
# "\textbf{...}" markup from being escaped.
latex_table <- xtable(all_metrics_df, caption = "Confusion Matrix Metrics Summary", auto = FALSE)
print(latex_table, type = "latex", booktabs = TRUE,
      caption.placement = "top", include.rownames = FALSE, sanitize.text.function = function(x){x})
################################################################################
# Agreement rates between model outputs and consolidated classification table
################################################################################


# Percentage of positions at which two label vectors agree.
# The vectors are compared element by element, so they must have the same
# length; otherwise R would silently recycle the shorter one.
# NOTE(review): this also assumes both vectors are in the same row order --
# confirm that the compared data sets are aligned.
percent_agreement <- function(labels_a, labels_b) {
  stopifnot(length(labels_a) == length(labels_b))
  mean(labels_a == labels_b) * 100
}

#---------------------------------------------------------------
# Zero-shot labels vs. predictions of each fine-tuned model
#---------------------------------------------------------------
# Names of the first data set in each pair (always the zero-shot table)
datasets_group1 <- rep("chat_wide_m",  times = 10)
datasets_group1name <- rep("GPT Zero-Shot",  times = 10)
# Names of the second data set in each pair
datasets_group2 <- c("ngo_df_250", "appen_df_250", "citi_df_250", "prolific_df_250", "ras_df_250", "ngo_df_100", "appen_df_100", "citi_df_100", "prolific_df_100", "ras_df_100")

# One agreement rate per pair; the data sets are looked up by name.
agreement_rates <- vapply(seq_along(datasets_group1), function(i) {
  # Recode the 0/1 zero-shot labels to the strings used in `prediction`.
  zero_shot <- get(datasets_group1[i]) %>% mutate(ishate_combined = ifelse(ishate_combined  == 0, "KEINE HATE SPEECH" , "HATE SPEECH"))
  fine_tuned <- get(datasets_group2[i])

  percent_agreement(zero_shot$ishate_combined, fine_tuned$prediction)
}, numeric(1))

# Print the agreement rates for each pair
print(agreement_rates)

# Print them again with a readable name per pair
names(agreement_rates) <- paste(datasets_group1name, datasets_group2, sep=" vs ")
print(agreement_rates)


#---------------------------------------------------------------
# Predictions after 100 labels vs. predictions after 250 labels
#---------------------------------------------------------------
# Names of the first data set in each pair
datasets_group1 <- c("ngo_df_100", "appen_df_100", "citi_df_100", "prolific_df_100", "ras_df_100")
datasets_group1name <- c("ngo_df_100", "appen_df_100", "citi_df_100", "prolific_df_100", "ras_df_100")
# Names of the second data set in each pair
datasets_group2 <- c("ngo_df_250", "appen_df_250", "citi_df_250", "prolific_df_250", "ras_df_250")

# One agreement rate per pair; the data sets are looked up by name.
agreement_rates <- vapply(seq_along(datasets_group1), function(i) {
  trained_on_100 <- get(datasets_group1[i])
  trained_on_250 <- get(datasets_group2[i])

  percent_agreement(trained_on_100$prediction, trained_on_250$prediction)
}, numeric(1))

# Print the agreement rates for each pair
print(agreement_rates)

# Print them again with a readable name per pair
names(agreement_rates) <- paste(datasets_group1name, datasets_group2, sep=" vs ")
print(agreement_rates)



# Start the consolidated table from df_1: mark its rows with fine_tune = 0,
# drop the text and per-annotator columns, and expose the combined label
# under the name `hatespeech`.
results_consolidated_df <- df_1 %>%
  mutate(fine_tune = 0) %>%
  dplyr::select(-c(difficult_case, unit_var, titel, text, target_combined,
                   ishatespeech_1, ishatespeech_2, ishatespeech_3,
                   target_group_1, target_group_2, target_group_3)) %>%
  dplyr::rename(hatespeech = ishate_combined)


# Bring one table of fine-tuned predictions into the layout of the
# consolidated results table.
#
# predictions  prediction table as read from the fine-tuning output
# group_label  value written to the `group` column
# n_labels     value written to the `fine_tune` column
#
# Steps: lower-case all column names, drop the prompt / completion columns,
# rename `text` to `kommentar` and `prediction` to `hatespeech`, and recode
# `hatespeech` to 1 for "HATE SPEECH" and 0 for everything else.
tidy_finetune_predictions <- function(predictions, group_label, n_labels) {
  names(predictions) <- tolower(names(predictions))

  predictions %>%
    dplyr::select(-c(zero_shot_prompt, user_prompt, completion_label,
                     openai_instance_format, openai_instance_without_completion)) %>%
    dplyr::rename(kommentar = text, hatespeech = prediction) %>%
    dplyr::mutate(hatespeech = ifelse(hatespeech == "HATE SPEECH", 1, 0),
                  group = group_label,
                  fine_tune = n_labels)
}

expert_df_100 <- tidy_finetune_predictions(expert_df_100, "experts", 100)
expert_df_250 <- tidy_finetune_predictions(expert_df_250, "experts", 250)

appen_df_100 <- tidy_finetune_predictions(appen_df_100, "appen", 100)
appen_df_250 <- tidy_finetune_predictions(appen_df_250, "appen", 250)

citi_df_100 <- tidy_finetune_predictions(citi_df_100, "citi", 100)
citi_df_250 <- tidy_finetune_predictions(citi_df_250, "citi", 250)

gpt_df_100 <- tidy_finetune_predictions(gpt_df_100, "chatGPT", 100)
gpt_df_250 <- tidy_finetune_predictions(gpt_df_250, "chatGPT", 250)

# NOTE(review): the NGO predictions are labelled "af" -- presumably the group
# code used elsewhere in the data; confirm.
ngo_df_100 <- tidy_finetune_predictions(ngo_df_100, "af", 100)
ngo_df_250 <- tidy_finetune_predictions(ngo_df_250, "af", 250)

prolific_df_100 <- tidy_finetune_predictions(prolific_df_100, "Prolific", 100)
prolific_df_250 <- tidy_finetune_predictions(prolific_df_250, "Prolific", 250)

ras_df_100 <- tidy_finetune_predictions(ras_df_100, "ra", 100)
ras_df_250 <- tidy_finetune_predictions(ras_df_250, "ra", 250)

# Append every fine-tuned prediction table to the consolidated table.
results_consolidated_df <- dplyr::bind_rows(
  results_consolidated_df,
  expert_df_100, expert_df_250,
  appen_df_100, appen_df_250,
  citi_df_100, citi_df_250,
  gpt_df_100, gpt_df_250,
  ngo_df_100, ngo_df_250,
  prolific_df_100, prolific_df_250,
  ras_df_100, ras_df_250
)
# Remove the `label_column` column, if present.
results_consolidated_df[["label_column"]] <- NULL

# Attach `target_combined` from the expert table, matched on article and
# comment id.
experts_wide_minimized <- dplyr::select(experts_wide, articleid, id, target_combined)
results_consolidated_df <- dplyr::left_join(results_consolidated_df,
                                            experts_wide_minimized,
                                            by = c("articleid", "id"))

write_csv(results_consolidated_df, "../tables/table_all_classifications_2_classes.csv")
